import jieba
import os

# csdn cut words from csdn_orgin_path to csdn_save_cut_path
def csdn_cut_words(csdn_orgin_path, csdn_save_cut_path, stop_words_path=None):
    all_paths = os.listdir(csdn_orgin_path)
    all_filenames = all_paths
    all_paths = [os.path.join(csdn_orgin_path, path) for path in all_paths]

    if stop_words_path:
        stop_words_dict = {}.fromkeys([word.strip() for word in open(stop_words_path, 'r', encoding='utf-8').readlines()])
        print('stop_words_dict: ', stop_words_dict)

    for index, path in enumerate(all_paths):
        with open(path, 'r', encoding='utf-8') as fr:
            with open(os.path.join(csdn_save_cut_path, all_filenames[index]), 'w+', encoding='utf-8') as fw:
                content = fr.read()
                cut_word_list = jieba.lcut(content, cut_all=False)   # Accurate model
                cut_word_list = [word.strip() for word in cut_word_list]

                # handle stop_words
                if stop_words_path:
                    final = ''
                    for word in cut_word_list:
                        if word not in stop_words_dict:
                            final += word
                    cut_word_list = jieba.lcut(final, cut_all=False)
                    print('final_cut_word_list: ', cut_word_list)

                fw.write(' '.join(cut_word_list))

if __name__ == '__main__':
    # Hard-coded local data directories for the CSDN corpus (Windows paths).
    orgin_dir = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_spider_datas'
    cut_dir = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_cut_datas'
    stopwords_file = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_stop_words\stopwords.txt'
    csdn_cut_words(orgin_dir, cut_dir, stop_words_path=stopwords_file)


